# load the census data
table.builder <- readRDS(file = './EDA/output/sa2_table_builder.rds')

#Missingness in the dataset

# check missingness in the original dataset
sapply(table.builder, function(x) sum(is.na(x)))
##            sa2_area           total_pop                male 
##                   0                   0                   3 
##              female            overseas           childcare 
##                   4                   4                   3 
##              employ         oc_positive         oc_negative 
##                   4                   7                   7 
##             sp_male           sp_female            sp_total 
##                   4                   4                   4 
##             student                age0               age10 
##                   4                   0                   0 
##               age20               age30               age40 
##                   0                   0                   0 
##               age50               age60               age70 
##                   0                   0                   0 
##               age80               age90              age100 
##                   0                   0                   0 
##   SA2_MAINCODE_2016 SA2_5DIGITCODE_2016       SA3_CODE_2016 
##                   2                   2                   2 
##       SA3_NAME_2016       SA4_CODE_2016       SA4_NAME_2016 
##                   2                   2                   2 
##          GCC_NAME16      time_to_orange       time_to_nowra 
##                   3                   7                   7 
##    time_to_canberra        time_to_port         time_to_rpa 
##                   7                   7                   7 
##    time_to_westmead       shortest_time 
##                   7                   7

#Prepare the Dataset

# Prepare the dataset for modelling
table.builder2 <- table.builder %>%
                    # create variable for the percentage of the population aged 60 and above
                    mutate(over60 = (rowSums(select(.,age60:age100)) / total_pop ) * 100 ) %>%
                    # drop the variables we wont need for the model
                    select(-c(age0,age10,age20,age30,age40,age50,age60,age70,age80,age90,age100,
                              SA2_MAINCODE_2016,SA2_5DIGITCODE_2016,SA3_CODE_2016,SA3_NAME_2016,
                              SA4_CODE_2016,SA4_NAME_2016, time_to_canberra, time_to_orange, time_to_nowra,
                              time_to_port, time_to_rpa, time_to_westmead)) %>%
                    # filter rows with no population
                    filter(total_pop > 0) %>%
                    # filter rows with missing outcome
                    filter(!is.na(shortest_time)) %>%
                    # Filter missing values from overcrowding stat
                    filter(!is.na(oc_positive)) %>%
                    # recreate outcome variable as numeric in minutes
                    mutate(st_min = as.numeric(shortest_time, 'minutes'))

#Distribution of the Outcome

ggplot(data = table.builder2, aes(x = st_min)) +
  geom_histogram() + 
  scale_x_continuous(breaks = seq(0,660,60))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Function for univariate Exploratory Data Analysis:

# scatter plots for each variable by the shortest time to an achd clinic
scatter.vars <- function(var) {
  #Make the plot
  ggplot(data = table.builder2, aes_string(x = 'st_min', y = var)) + 
    geom_point() +
    geom_smooth(color = "Blue", se=FALSE, size = 0.5) +
    geom_smooth(method = "lm", color = "red", se=FALSE, size = 0.5) +
    scale_x_continuous(breaks = seq(0,660,60)) +
    ggtitle(var)
}

vars <- c("total_pop","male", "female","overseas","childcare","employ","oc_positive","oc_negative","sp_male",                      "sp_female","sp_total","student","over60")  

#Modelling the entire dataset

##How is each variable related to the outcome?

There seems to be two different relationships occuring between the variables and the outcome. For travel times less than 120 mins, there is a stark change, either positive or negative, in some variables. After this time, however the relationship i smore contast.
- Positive change over the first 120 mins: houses with spare rooms, single parents households, over 60 popultion
- Negative change over the first 120 mins: Total population, percentage born overseas, houses with overcrowding, percentage of students
- No change: percentage of males or females

Unemployement doesnt show much change but the large outliers may be hiding this

lapply(vars, function(x) scatter.vars(x))
## [[1]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## 
## [[2]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## 
## [[3]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## 
## [[4]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## 
## [[5]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## 
## [[6]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## 
## [[7]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## 
## [[8]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## 
## [[9]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## 
## [[10]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## 
## [[11]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## 
## [[12]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## 
## [[13]]
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

check unemployment without outliers

  ggplot(data = table.builder2, aes(x = st_min, y = employ)) + 
    geom_point() +
    geom_smooth(color = "Blue", se=FALSE, size = 0.5) +
    geom_smooth(method = "lm", color = "red", se=FALSE, size = 0.5) +
    scale_x_continuous(breaks = seq(0,660,60)) +
    scale_y_continuous(limits = c(0,7.5)) +
    ggtitle("unemployement")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

##Fit Univariate Linear Models

#functions to fit univariate glms
univ.lm <- function(outcome, variables) {
  # list to store output tables
  out <- list()
  # count to add to list
  count <- 1
  # loop through the variables
  for (i in variables) { 
    f <- as.formula(paste(outcome, i, sep = '~'))
    # fit the glm model and add to list
    out[[count]] <- lm(f, data=table.builder2)
    #add to the count
    count <- count + 1
  }
  return(out)
}
#fit an univarite model for each variable
uni.mods <- univ.lm("st_min", vars)

# display the coefficients and p values for each univariate model
for (i in 1:length(vars)) {
  print(summary(uni.mods[[i]])$coefficients)
}
##                  Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept) 158.078500538 8.6917048406 18.187284 1.438701e-58
## total_pop    -0.005240542 0.0005839859 -8.973747 4.181821e-18
##               Estimate Std. Error    t value  Pr(>|t|)
## (Intercept) 105.055076  65.938087  1.5932382 0.1116641
## male         -0.316832   1.323258 -0.2394334 0.8108561
##               Estimate Std. Error   t value  Pr(>|t|)
## (Intercept) 70.5954260  58.357306 1.2097102 0.2268942
## female       0.3718028   1.156598 0.3214624 0.7479784
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 189.730903  8.0771513  23.48983 1.000497e-85
## overseas     -3.248089  0.2309918 -14.06149 9.626960e-39
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 128.642187  22.737256  5.657771 2.435490e-08
## childcare    -1.799417   1.020624 -1.763055 7.842986e-02
##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 96.149086   7.422959 12.952933 8.483542e-34
## employ      -2.313486   2.025953 -1.141925 2.539672e-01
##               Estimate Std. Error  t value   Pr(>|t|)
## (Intercept) 32.2387408 19.7988803 1.628311 0.10401437
## oc_positive  0.9801954  0.3317902 2.954263 0.00326424
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 124.289986   5.839790 21.283298 2.543746e-74
## oc_negative  -9.352942   1.104309 -8.469496 2.122294e-16
##             Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 19.58517   11.57320 1.692286 9.114077e-02
## sp_male     91.27459   14.10469 6.471223 2.104604e-10
##             Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 52.37748  13.453574 3.893202 0.0001107016
## sp_female   11.13407   3.838928 2.900308 0.0038725146
##             Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 38.31050  13.844817 2.767136 0.0058397265
## sp_total    12.49739   3.223442 3.877033 0.0001181149
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 242.549810 19.5462432 12.409024 1.889368e-31
## student      -6.593067  0.8217367 -8.023333 5.940304e-15
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -20.069214 12.4669534 -1.609793 1.079996e-01
## over60        4.650473  0.5008792  9.284621 3.421759e-19

##Build a model with the variables that are significant in the univariate models

sa2.lm <- lm(st_min ~ total_pop + overseas + childcare + employ + oc_positive + 
                          oc_negative + sp_male + sp_female + student + over60,
                          data = table.builder2)

summary(sa2.lm)
## 
## Call:
## lm(formula = st_min ~ total_pop + overseas + childcare + employ + 
##     oc_positive + oc_negative + sp_male + sp_female + student + 
##     over60, data = table.builder2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -297.05  -46.14  -16.01   21.18  488.97 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.156e+02  4.208e+01   7.500 2.53e-13 ***
## total_pop   -1.667e-03  6.272e-04  -2.657 0.008108 ** 
## overseas    -3.931e+00  4.374e-01  -8.987  < 2e-16 ***
## childcare   -4.620e+00  1.299e+00  -3.556 0.000409 ***
## employ      -1.972e+00  1.939e+00  -1.017 0.309696    
## oc_positive -3.093e-01  5.380e-01  -0.575 0.565563    
## oc_negative  4.511e+00  1.862e+00   2.423 0.015726 *  
## sp_male      1.951e+01  1.670e+01   1.169 0.243028    
## sp_female    3.671e+00  4.735e+00   0.775 0.438426    
## student     -4.700e-01  1.187e+00  -0.396 0.692415    
## over60       3.804e-01  6.606e-01   0.576 0.565015    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 85.12 on 558 degrees of freedom
## Multiple R-squared:  0.347,  Adjusted R-squared:  0.3353 
## F-statistic: 29.66 on 10 and 558 DF,  p-value: < 2.2e-16

##Build another model with the sigificant variables from sa2.lm

reduced.sa2.lm <- lm(st_min ~ total_pop + overseas + childcare + oc_negative, data = table.builder2)

summary(reduced.sa2.lm)
## 
## Call:
## lm(formula = st_min ~ total_pop + overseas + childcare + oc_negative, 
##     data = table.builder2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -322.71  -46.60  -15.32   17.66  497.74 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.413e+02  2.294e+01  14.879  < 2e-16 ***
## total_pop   -1.772e-03  5.909e-04  -2.999 0.002827 ** 
## overseas    -4.341e+00  3.743e-01 -11.600  < 2e-16 ***
## childcare   -5.263e+00  9.211e-01  -5.714 1.79e-08 ***
## oc_negative  5.506e+00  1.559e+00   3.532 0.000446 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 85.29 on 564 degrees of freedom
## Multiple R-squared:  0.3375, Adjusted R-squared:  0.3328 
## F-statistic: 71.82 on 4 and 564 DF,  p-value: < 2.2e-16
anova(reduced.sa2.lm, sa2.lm)
## Analysis of Variance Table
## 
## Model 1: st_min ~ total_pop + overseas + childcare + oc_negative
## Model 2: st_min ~ total_pop + overseas + childcare + employ + oc_positive + 
##     oc_negative + sp_male + sp_female + student + over60
##   Res.Df     RSS Df Sum of Sq      F Pr(>F)
## 1    564 4102560                           
## 2    558 4043203  6     59358 1.3653 0.2265

##Model Diagnositcs Checking the residuals - homoskedacisty

# load the olsrr library
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
# residuals versus fitted values plot
ols_plot_resid_fit(reduced.sa2.lm)

# Breusch-Pagan test from olsrr library
ols_test_breusch_pagan(reduced.sa2.lm, rhs=TRUE)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                        Data                         
##  ---------------------------------------------------
##  Response : st_min 
##  Variables: total_pop overseas childcare oc_negative 
## 
##          Test Summary           
##  -------------------------------
##  DF            =    4 
##  Chi2          =    251.3026 
##  Prob > Chi2   =    3.411563e-53
table.builder2$residuals <- residuals(reduced.sa2.lm)
ggplot(data = table.builder2, aes(x = st_min, y = residuals)) +
    geom_point() +
    scale_x_continuous(breaks = seq(0,660,60)) +
    ggtitle("Residuals vs Outcome")

ggplot(data = table.builder2, aes(x = total_pop, y = residuals)) +
    geom_point() +
    ggtitle("Residuals vs Total Population")

ggplot(data = table.builder2, aes(x = overseas, y = residuals)) +
    geom_point() +
    ggtitle("Residuals vs % of population born overseas")

ggplot(data = table.builder2, aes(x = childcare, y = residuals)) +
    geom_point() +
    ggtitle("Residuals vs % of population in chlidcare")

ggplot(data = table.builder2, aes(x = oc_negative, y = residuals)) +
    geom_point() +
    ggtitle("Residuals vs % of overcrowded households")

Checking the residuals - normality

qqnorm(residuals(reduced.sa2.lm), ylab="Residuals")
qqline(residuals(reduced.sa2.lm))

hist(residuals(reduced.sa2.lm), breaks = 50)

shapiro.test(residuals(reduced.sa2.lm))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(reduced.sa2.lm)
## W = 0.84545, p-value < 2.2e-16

reject the null hypothesis that the residuals are normal

Checking for influentiak Observations

library(faraway)
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
## 
## Attaching package: 'faraway'
## The following object is masked from 'package:olsrr':
## 
##     hsb
faraway::halfnorm(cooks.distance(reduced.sa2.lm), nlab = 1, ylab="cooks distance", lab = table.builder2$sa2_area)

# refit the model without the port botany industrial area
reduced.sa2.lm.2 <- lm(st_min ~ total_pop + overseas + childcare + oc_negative, data = table.builder2, subset = (cooks.distance(reduced.sa2.lm) < 0.02))

summary(reduced.sa2.lm.2)
## 
## Call:
## lm(formula = st_min ~ total_pop + overseas + childcare + oc_negative, 
##     data = table.builder2, subset = (cooks.distance(reduced.sa2.lm) < 
##         0.02))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -175.67  -44.82  -13.55   20.22  359.98 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.771e+02  2.336e+01  16.145  < 2e-16 ***
## total_pop   -1.781e-03  5.359e-04  -3.323 0.000949 ***
## overseas    -4.262e+00  3.438e-01 -12.395  < 2e-16 ***
## childcare   -6.899e+00  9.326e-01  -7.398 5.12e-13 ***
## oc_negative  4.665e+00  1.424e+00   3.276 0.001118 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 77.06 on 557 degrees of freedom
## Multiple R-squared:  0.3823, Adjusted R-squared:  0.3778 
## F-statistic: 86.18 on 4 and 557 DF,  p-value: < 2.2e-16
# residuals versus fitted values plot
ols_plot_resid_fit(reduced.sa2.lm.2)

# Breusch-Pagan test from olsrr library
ols_test_breusch_pagan(reduced.sa2.lm.2, rhs=TRUE)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                        Data                         
##  ---------------------------------------------------
##  Response : st_min 
##  Variables: total_pop overseas childcare oc_negative 
## 
##          Test Summary           
##  -------------------------------
##  DF            =    4 
##  Chi2          =    156.8503 
##  Prob > Chi2   =    6.923801e-33

This model breaks a fair few assumptions, constant variance and normality of the residuals. Also the Residual standard error is quite high and the R2 quite low. suggesting a poor fit. The initial EDA also suggested the data was not linear and the spread of the outcome is not normal. It might be that the sydney population and rural population have very different circumstance. Perhaps sepaerating them would help.

Also, judging by the relationship between the residuals and the predictors, some of the very low population areas might be causing an issue for the model fit, so we can try remove areas with population less than 100

#Distribution of the Outcome separated by GCC Name

table.builder2$GCC_NAME16 <- as.factor(table.builder2$GCC_NAME16)

table.builder.syd <- table.builder2 %>% filter(GCC_NAME16 == "Greater Sydney") %>% filter(total_pop > 100)
table.builder.nsw <- table.builder2 %>% filter(GCC_NAME16 == "Rest of NSW") %>% filter(total_pop > 100)
ggplot(data = table.builder.syd, aes(x = st_min)) +
  geom_histogram() + 
  scale_x_continuous(breaks = seq(0,660,60))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = table.builder.nsw, aes(x = st_min)) +
  geom_histogram() + 
  scale_x_continuous(breaks = seq(0,660,60))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Function for univariate Exploratory Data Analysis, separated by urban/regional SA2 areas:

# scatter plots for each variable by the shortest time to an achd clinic
scatter.vars.grouped <- function(var) {
  #plot the boxplot
  ggplot(data = table.builder2, aes_string(x = 'st_min', y = var, color = 'GCC_NAME16')) + 
    geom_point(size = 0.25) +
    geom_smooth(method = 'lm', size = 1.25) +
    scale_x_continuous(breaks = seq(0,660,60)) +
    ggtitle(var)
}

vars <- c("total_pop","male", "female","overseas","childcare","employ","oc_positive","oc_negative","sp_male",                      "sp_female","sp_total","student","over60")  
lapply(vars, function(x) scatter.vars.grouped(x))
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

#Modelling the Sydney dataset

##Fit Univariate Linear Models

#functions to fit univariate regression models
univ.lm <- function(outcome, variables) {
  # list to store output tables
  out <- list()
  # count to add to list
  count <- 1
  # loop through the variables
  for (i in variables) { 
    f <- as.formula(paste(outcome, i, sep = '~'))
    # fit the glm model and add to list
    out[[count]] <- lm(f, data=table.builder.syd)
    #add to the count
    count <- count + 1
  }
  return(out)
}
#fit an univarite model for each variable
uni.mods <- univ.lm("st_min", vars)

# display the coefficients and p values for each univariate model
for (i in 1:length(vars)) {
  print(summary(uni.mods[[i]])$coefficients)
}
##                 Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept) 46.041926293 2.9772870656 15.464389 5.972558e-40
## total_pop   -0.001128612 0.0001702919 -6.627511 1.612969e-10
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 121.375688 36.3150285  3.342299 0.0009376152
## male         -1.899057  0.7363815 -2.578904 0.0103942361
##               Estimate Std. Error   t value   Pr(>|t|)
## (Intercept) -68.316964 37.3913563 -1.827079 0.06869473
## female        1.894836  0.7369582  2.571158 0.01062433
##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 65.318104 2.48680649  26.26586 2.552974e-79
## overseas    -0.917315 0.05683346 -16.14040 1.778215e-42
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -13.851210  6.7860564 -2.041128 4.212482e-02
## childcare     1.850531  0.2974519  6.221277 1.679462e-09
##             Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 40.19824   3.879058 10.36289 1.130627e-21
## employ      -4.21053   1.251856 -3.36343 8.711515e-04
##                Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -10.4877389 4.84618456 -2.164123 3.125448e-02
## oc_positive   0.6698549 0.08266751  8.103001 1.422453e-14
##              Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 38.666989  1.7100152 22.61207 1.878443e-66
## oc_negative -2.006895  0.2431683 -8.25311 5.150880e-15
##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -4.950695   4.014545 -1.233190 2.184835e-01
## sp_male     47.995684   5.670231  8.464503 1.209079e-15
##             Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 13.18286   3.649637 3.612103 3.566871e-04
## sp_female    4.48312   1.062118 4.220924 3.240008e-05
##             Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 10.04424  3.7920787 2.648742 8.512744e-03
## sp_total     4.50395  0.9172225 4.910422 1.505024e-06
##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 57.830494  7.9389335  7.284416 2.942134e-12
## student     -1.177976  0.3076969 -3.828365 1.574436e-04
##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -9.821078  3.7569606 -2.614102 9.403914e-03
## over60       1.920408  0.1845368 10.406640 8.079510e-22

##Build a model with the variables that are significant in the univariate models

significant variables in univariate model: total_pop, male, female, overseas, childcare, employ, oc_positive, oc_negative, sp_male, sp_female, sp_total, student, over_60

sa2.lm <- lm(st_min ~ total_pop + male + female + overseas + childcare + employ + oc_positive + 
                          oc_negative + sp_male + sp_female + sp_total+  student + over60,
                          data = table.builder.syd)

summary(sa2.lm)
## 
## Call:
## lm(formula = st_min ~ total_pop + male + female + overseas + 
##     childcare + employ + oc_positive + oc_negative + sp_male + 
##     sp_female + sp_total + student + over60, data = table.builder.syd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.559  -8.637  -0.813   7.704  37.865 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.543e+03  2.109e+03  -0.731  0.46508    
## total_pop   -3.484e-04  1.165e-04  -2.991  0.00302 ** 
## male         1.613e+01  2.110e+01   0.765  0.44517    
## female       1.528e+01  2.108e+01   0.725  0.46893    
## overseas    -1.200e+00  1.064e-01 -11.275  < 2e-16 ***
## childcare    8.475e-01  3.601e-01   2.353  0.01928 *  
## employ       1.372e+01  1.797e+00   7.637 3.39e-13 ***
## oc_positive -2.778e-01  1.384e-01  -2.007  0.04572 *  
## oc_negative -3.133e-01  4.314e-01  -0.726  0.46827    
## sp_male      4.034e+00  8.171e+00   0.494  0.62194    
## sp_female   -2.297e+00  1.530e+00  -1.502  0.13432    
## sp_total            NA         NA      NA       NA    
## student     -1.505e-01  2.890e-01  -0.521  0.60298    
## over60       1.111e+00  1.831e-01   6.066 4.15e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.06 on 285 degrees of freedom
## Multiple R-squared:  0.677,  Adjusted R-squared:  0.6634 
## F-statistic: 49.78 on 12 and 285 DF,  p-value: < 2.2e-16

##Build another model with the sigificant variables from sa2.lm

reduced.sa2.lm <- lm(st_min ~ total_pop + overseas + childcare + employ + oc_positive + over60, data = table.builder.syd)

summary(reduced.sa2.lm)
## 
## Call:
## lm(formula = st_min ~ total_pop + overseas + childcare + employ + 
##     oc_positive + over60, data = table.builder.syd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.329  -8.640  -0.591   7.584  39.434 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 35.0429430  9.5181992   3.682 0.000276 ***
## total_pop   -0.0004153  0.0001136  -3.656 0.000304 ***
## overseas    -1.2178819  0.0797015 -15.281  < 2e-16 ***
## childcare    0.6534680  0.3356037   1.947 0.052480 .  
## employ      11.1252769  1.0288700  10.813  < 2e-16 ***
## oc_positive -0.3006557  0.1073464  -2.801 0.005439 ** 
## over60       0.9676831  0.1515301   6.386 6.72e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.13 on 291 degrees of freedom
## Multiple R-squared:  0.6661, Adjusted R-squared:  0.6592 
## F-statistic: 96.76 on 6 and 291 DF,  p-value: < 2.2e-16
anova(reduced.sa2.lm, sa2.lm)
## Analysis of Variance Table
## 
## Model 1: st_min ~ total_pop + overseas + childcare + employ + oc_positive + 
##     over60
## Model 2: st_min ~ total_pop + male + female + overseas + childcare + employ + 
##     oc_positive + oc_negative + sp_male + sp_female + sp_total + 
##     student + over60
##   Res.Df   RSS Df Sum of Sq      F Pr(>F)
## 1    291 42818                           
## 2    285 41423  6    1394.8 1.5995 0.1471

##Build a further reduced model

reduced.sa2.lm.2 <- lm(st_min ~ total_pop + overseas + employ + over60, data = table.builder.syd)

summary(reduced.sa2.lm.2)
## 
## Call:
## lm(formula = st_min ~ total_pop + overseas + employ + over60, 
##     data = table.builder.syd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.213  -8.363  -1.088   7.142  40.711 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.4417669  5.1305418   5.933 8.36e-09 ***
## total_pop   -0.0003898  0.0001143  -3.411 0.000738 ***
## overseas    -1.1370888  0.0672525 -16.908  < 2e-16 ***
## employ      10.9293422  1.0107852  10.813  < 2e-16 ***
## over60       0.9159013  0.1503025   6.094 3.47e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.25 on 293 degrees of freedom
## Multiple R-squared:  0.6571, Adjusted R-squared:  0.6524 
## F-statistic: 140.4 on 4 and 293 DF,  p-value: < 2.2e-16
anova(reduced.sa2.lm.2, sa2.lm)
## Analysis of Variance Table
## 
## Model 1: st_min ~ total_pop + overseas + employ + over60
## Model 2: st_min ~ total_pop + male + female + overseas + childcare + employ + 
##     oc_positive + oc_negative + sp_male + sp_female + sp_total + 
##     student + over60
##   Res.Df   RSS Df Sum of Sq      F  Pr(>F)  
## 1    293 43972                              
## 2    285 41423  8    2549.2 2.1924 0.02811 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The Analysis of variance suggests that reducing the model further does help to imporve the residual errors, so we will go with the second reduced model.

reduced.sa2.lm <- lm(st_min ~ total_pop + overseas + employ + oc_positive + over60, data = table.builder.syd)

##Model Diagnoistics

# residuals versus fitted values plot
ols_plot_resid_fit(reduced.sa2.lm.2)

# Breusch-Pagan test from olsrr library
ols_test_breusch_pagan(reduced.sa2.lm.2, rhs=TRUE)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                    Data                     
##  -------------------------------------------
##  Response : st_min 
##  Variables: total_pop overseas employ over60 
## 
##          Test Summary           
##  -------------------------------
##  DF            =    4 
##  Chi2          =    26.86737 
##  Prob > Chi2   =    2.114471e-05
table.builder.syd$residuals <- residuals(reduced.sa2.lm.2)
ggplot(data = table.builder.syd, aes(x = st_min, y = residuals)) +
    geom_point() +
    scale_x_continuous(breaks = seq(0,660,60)) +
    ggtitle("Residuals vs Outcome")

ggplot(data = table.builder.syd, aes(x = total_pop, y = residuals)) +
    geom_point() +
    ggtitle("Residuals vs Total Population")

ggplot(data = table.builder.syd, aes(x = overseas, y = residuals)) +
    geom_point() +
    ggtitle("Residuals vs % of population born overseas")

ggplot(data = table.builder.syd, aes(x = childcare, y = residuals)) +
    geom_point() +
    ggtitle("Residuals vs % of population engaged in childcare")

ggplot(data = table.builder.syd, aes(x = employ, y = residuals)) +
    geom_point() +
    ggtitle("Residuals vs % of population unemployed")

ggplot(data = table.builder.syd, aes(x = oc_positive, y = residuals)) +
    geom_point() +
    ggtitle("Residuals vs % of undercrowded households")

ggplot(data = table.builder.syd, aes(x = over60, y = residuals)) +
    geom_point() +
    ggtitle("Residuals vs % of population over 60 years")

# test model with transformation in 'overseas'
lm.test <- lm(st_min ~ total_pop + I(log(overseas)) + employ + oc_positive + over60, data = table.builder.syd)

summary(lm.test)
## 
## Call:
## lm(formula = st_min ~ total_pop + I(log(overseas)) + employ + 
##     oc_positive + over60, data = table.builder.syd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -30.238  -7.639  -0.150   6.614  36.480 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.505e+02  1.229e+01  12.245  < 2e-16 ***
## total_pop        -3.081e-04  1.076e-04  -2.864  0.00448 ** 
## I(log(overseas)) -4.330e+01  2.482e+00 -17.448  < 2e-16 ***
## employ            9.136e+00  9.075e-01  10.067  < 2e-16 ***
## oc_positive      -1.043e-01  7.063e-02  -1.476  0.14094    
## over60            9.453e-01  1.393e-01   6.786 6.42e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.45 on 292 degrees of freedom
## Multiple R-squared:  0.7014, Adjusted R-squared:  0.6963 
## F-statistic: 137.2 on 5 and 292 DF,  p-value: < 2.2e-16
# residuals versus fitted values plot
ols_plot_resid_fit(lm.test)

# Breusch-Pagan test from olsrr library
ols_test_breusch_pagan(lm.test, rhs=TRUE)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                              Data                               
##  ---------------------------------------------------------------
##  Response : st_min 
##  Variables: total_pop I(log(overseas)) employ oc_positive over60 
## 
##          Test Summary           
##  -------------------------------
##  DF            =    5 
##  Chi2          =    62.9082 
##  Prob > Chi2   =    3.041438e-12
plot(table.builder.syd$overseas, residuals(lm.test))

#Modelling the Rest of NSW dataset

##Fit Univariate Linear Models

#functions to fit univariate regression models
univ.lm <- function(outcome, variables) {
  # list to store output tables
  out <- list()
  # count to add to list
  count <- 1
  # loop through the variables
  for (i in variables) { 
    f <- as.formula(paste(outcome, i, sep = '~'))
    # fit the glm model and add to list
    out[[count]] <- lm(f, data=table.builder.nsw)
    #add to the count
    count <- count + 1
  }
  return(out)
}
#fit an univarite model for each variable
uni.mods <- univ.lm("st_min", vars)

# display the coefficients and p values for each univariate model
for (i in 1:length(vars)) {
  print(summary(uni.mods[[i]])$coefficients)
}
##                  Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept) 198.599210086 15.142710179 13.115169 2.097946e-30
## total_pop    -0.003558164  0.001300985 -2.734977 6.674453e-03
##               Estimate Std. Error   t value   Pr(>|t|)
## (Intercept) -294.19589 249.711212 -1.178144 0.23983300
## male           9.21502   5.040391  1.828235 0.06867794
##               Estimate Std. Error   t value   Pr(>|t|)
## (Intercept) 617.101750 255.098726  2.419070 0.01625803
## female       -9.012787   5.051497 -1.784181 0.07557849
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 199.490511   29.47895  6.767218 8.900618e-11
## overseas     -1.959859    1.49923 -1.307244 1.923029e-01
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 349.395280  51.755581  6.750872 9.794638e-11
## childcare    -8.650584   2.368331 -3.652607 3.147944e-04
##               Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) 167.311910  28.767559  5.8159927 1.791002e-08
## employ       -1.817301   9.783957 -0.1857429 8.527934e-01
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 370.926687 57.8782714  6.408738 7.007389e-10
## oc_positive  -3.408254  0.9376716 -3.634806 3.361916e-04
##                Estimate Std. Error     t value     Pr(>|t|)
## (Intercept) 163.1853599  19.548346  8.34778336 4.393029e-15
## oc_negative  -0.5264033   9.159154 -0.05747292 9.542133e-01
##              Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 120.82891   34.41101 3.511345 0.0005267703
## sp_male      47.09133   38.33828 1.228311 0.2204583679
##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) 161.82235870  28.881005 5.60307219 5.431385e-08
## sp_female     0.09066583   7.905044 0.01146936 9.908579e-01
##               Estimate Std. Error   t value    Pr(>|t|)
## (Intercept) 154.850076  31.925134 4.8504127 2.14137e-06
## sp_total      1.652748   7.044041 0.2346306 8.14683e-01
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 268.496841  40.943106  6.557803 2.998383e-10
## student      -4.985907   1.889582 -2.638630 8.834410e-03
##                Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) 176.1904125  29.267957  6.0199081 6.021070e-09
## over60       -0.5005823   1.010131 -0.4955618 6.206287e-01

##Build a model with the variables that are significant in the univariate models

significant variables in univariate model: total_pop, childcare, oc_positive, student

sa2.lm <- lm(st_min ~ total_pop + childcare + oc_positive + student, data = table.builder.nsw)

summary(sa2.lm)
## 
## Call:
## lm(formula = st_min ~ total_pop + childcare + oc_positive + student, 
##     data = table.builder.nsw)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -198.84  -85.30  -23.41   63.64  491.11 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 386.991930  61.913748   6.251 1.73e-09 ***
## total_pop    -0.001948   0.001421  -1.371    0.171    
## childcare    -4.958657   3.578752  -1.386    0.167    
## oc_positive  -1.597960   1.277203  -1.251    0.212    
## student       0.015862   2.476937   0.006    0.995    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 113.8 on 253 degrees of freedom
## Multiple R-squared:  0.06686,    Adjusted R-squared:  0.05211 
## F-statistic: 4.532 on 4 and 253 DF,  p-value: 0.001492

Perhaps this suggests that there is not much relationship between the outcome and predictors?

##Model Diagnositics

# residuals versus fitted values plot
ols_plot_resid_fit(sa2.lm)

# Breusch-Pagan test from olsrr library
ols_test_breusch_pagan(sa2.lm, rhs=TRUE)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                        Data                        
##  --------------------------------------------------
##  Response : st_min 
##  Variables: total_pop childcare oc_positive student 
## 
##          Test Summary          
##  ------------------------------
##  DF            =    4 
##  Chi2          =    32.01021 
##  Prob > Chi2   =    1.90393e-06